/****************************************************************************************************
 * This .do file performs the following tasks:
 * 
 * 1. Imports raw individual productivity data from paper bag making task
 * 2. Cleans data:
 *    - Drops unnecessary variables
 *    - Reshapes data which is necessary because factory managers entered data by production floor
 *    - Merges with religion identifier 
 *    - Fixes ids enteered wrongly by managers
 *    - Saves for Analysis
 ****************************************************************************************************/
 
pause on
clear all


* Load Data
use $Data/Original/individual_productivity_raw.dta, clear

* Create Stata time
generate double starttime = clock(startdate, "YMDhms")
format starttime %tc

* Convert to IST 
replace starttime = starttime + 12.5*60*60*1000
g date = dofc(starttime)
format date %d
order date starttime

* Double-check date
g day = substr(q8,1,2)
g month = substr(q8,4,2)
g year = substr(q8,7,4)
destring day month year, replace
g datecheck = mdy(month,day,year)
drop day month year
assert date==datecheck
drop datecheck q8

* Fix duration
destring durationinseconds, replace
g duration_mins = durationinseconds/60
la var duration_mins "Survey duration (minutes)"
move duration_mins durationinseconds
drop durationinseconds

* Drop observations
drop if date<mdy(2,3,2024) // pre-experiment
drop if status=="Survey Preview" & date==mdy(2,5,2024) // testing
drop if responseid=="R_1rPJSiHrKZzv1jq" // testing

* Drop variables
drop startdate enddate status progress finished recordeddate ///
	recipient*name recipientemail externalreference locationlatitude ///
	locationlongitude distributionchannel userlanguage responseid
	
* Check dates in dataset match with lead-in days 1 and 3 from timeline
preserve
	import delimited $Data/Original/timeline_individualdata.csv, varnames(1) clear	
	keep date wave factoryactivity
	keep if strpos(factoryactivity,"Lead")
	assert _N==30
	keep date wave	
	replace wave = substr(wave,6,2)	
	destring wave, replace
	g month=2 if substr(date,1,3)=="Feb"
	replace month=3 if substr(date,1,3)=="Mar"
	replace month=4 if substr(date,1,3)=="Apr"
	replace month=5 if substr(date,1,3)=="May"
	replace month=6 if substr(date,1,3)=="Jun"
	replace month=7 if substr(date,1,3)=="Jul"
	assert !mi(month)
	g year=2024
	replace date = substr(date,5,2)
	strip date, of("-") g(day)
	destring day, replace
	drop date
	g date = mdy(month,day,year)
	drop month day year
	format date %d
	sort wave date, stable
	by wave: g leadin_day_no = _n
	tempfile leadindays
	save `leadindays'
restore

merge m:1 date using `leadindays', assert(3) nogen
order wave leadin_day_no q1

la var wave "Wave number (1 to 10)"
la var leadin_day_no "Lead-in day number (1 to 3)"
ren q1 leadin_slot
replace leadin_slot = substr(leadin_slot,-1,1)
destring leadin_slot, replace
la var leadin_slot "Which daily shift (slot) is this? (1 or 2)"
la var date "Lead-in individual productivity survey date"
la var starttime "Lead-in individual productivity survey start time"

foreach var of varlist wave leadin_day_no leadin_slot date starttime duration_mins {
	assert !mi(`var')
}

* Check cases where surprisingly few surveys per slot
bys wave leadin_day_no leadin_slot: g tot_tmp = _N
tab tot_tmp
replace leadin_slot=2 if wave==8 & leadin_day_no==2 & leadin_slot==1 & _id=="EDUON" // mis-coded as slot 1
drop tot_tmp

* Check cases where hour of survey seems too late or too early given the slot
g hour_tmp = hh(starttime)
tab hour_tmp leadin_slot
replace leadin_slot=2 if wave==6 & leadin_day_no==2 & leadin_slot==1 & _id=="CVNWI" // mis-coded as slot 1
replace leadin_slot=2 if wave==7 & leadin_day_no==2 & leadin_slot==1 & _id=="YINAY" // mis-coded as slot 1
replace leadin_slot=2 if wave==1 & leadin_day_no==3 & leadin_slot==1 & _id=="YHJVC" // mis-coded as slot 1
replace leadin_slot=2 if wave==2 & leadin_day_no==1 & leadin_slot==1 & _id=="BPRAN" // mis-coded as slot 1
replace leadin_slot=2 if wave==3 & leadin_day_no==1 & leadin_slot==1 & _id=="YTQZS" // mis-coded as slot 1
replace leadin_slot=2 if wave==5 & leadin_day_no==1 & leadin_slot==1 & _id=="JJHGN" // mis-coded as slot 1
replace leadin_slot=2 if wave==9 & leadin_day_no==1 & leadin_slot==1 & _id=="DGLQR" // mis-coded as slot 1
drop hour_tmp

ren q12 overall_qual
destring overall_qual, replace

ren q2 manager_name
assert !mi(manager_name)

ren q9 how_many_participants
destring how_many_participants, replace
assert !mi(how_many_participants)
drop how_many_participants // we actually don't need this

* Prepare for reshape
ren _id id1
ren _smallbags small_bags1
ren _shift_starttime shift_start1
ren _quality quality1

local j=27
forv i=2/46 {
	ren v`j' id`i'
	local j=`j'+1
	
	ren v`j' small_bags`i'
	local j=`j'+1
	
	ren v`j' shift_start`i'
	local j=`j'+1
	
	ren v`j' quality`i'
	local j=`j'+1
}

* Get a unique ID for the submitted survey
sort wave leadin_day_no leadin_slot starttime, stable
g survey_id = _n

** Reshape
reshape long id small_bags shift_start quality, i(survey_id) j(participant)

order wave leadin_day_no leadin_slot survey_id participant
drop if mi(id)

la var survey_id "ID of submitted indiv produc survey (can be multiple submitted per shift)"
drop participant

la var id "Worker ID"
drop if id=="AAAA" // must be because entered too many participants by accident, so had to enter this for the last one
replace id="FUHMB" if strpos(id,"FUHMB")
replace id="NVSXG" if strpos(id,"NVSXG")
assert length(id)==5
replace id=upper(id)
chartab id // check only capital letters

la var overall_qual "Overall quality 0-10 (at shift-survey_id-level)"

destring small_bags, replace
assert !mi(small_bags)
la var small_bags "How many small bags did this participant make?"

la var shift_start "At what time did this participant start the shift?"
replace shift_start="13:00" if shift_start=="1.00pm"
replace shift_start="13:35" if shift_start=="1.35pm"
replace shift_start="13:45" if shift_start=="1.45pm"
replace shift_start="10:00" if shift_start=="10.00am"
replace shift_start="10:05" if shift_start=="10.05am"
replace shift_start="10:10" if shift_start=="10.10Am"
replace shift_start="10:15" if shift_start=="10.15am"
replace shift_start="10:25" if shift_start=="10.25am"
replace shift_start="10:35" if shift_start=="10.35" | shift_start=="10.35am"
replace shift_start="10:40" if shift_start=="10.40am"
replace shift_start="10:00" if shift_start=="10.00am" | shift_start=="10:00am"
replace shift_start="10:10" if strpos(shift_start,"10:10")
replace shift_start="13:00" if shift_start=="1:00" | shift_start=="1:006"
replace shift_start=subinstr(shift_start,"1:","13:",1) if substr(shift_start,1,2)=="1:"
replace shift_start=subinstr(shift_start,"2:","14:",1) if substr(shift_start,1,2)=="2:"
replace shift_start=subinstr(shift_start,"2.","14:",1) if substr(shift_start,1,2)=="2."
replace shift_start="14:00" if shift_start=="14:00pm"
replace shift_start="14:15" if shift_start=="14:15pm"
replace shift_start="14:40" if shift_start=="14:40pm"
replace shift_start="15:00" if shift_start=="3.00pm"
replace shift_start="15:00" if shift_start=="3:00"
replace shift_start="09:00" if shift_start=="9" | shift_start=="9.00am" | shift_start=="9;00"
replace shift_start=subinstr(shift_start,"am","",1) if substr(shift_start,1,2)=="9."
replace shift_start=subinstr(shift_start,"9.","09:",1) if substr(shift_start,1,2)=="9."
replace shift_start=subinstr(shift_start,"9:","09:",1) if substr(shift_start,1,2)=="9:"
replace shift_start="09:45" if strpos(shift_start,"09:45")
replace shift_start="" if inlist(shift_start,"IKHGL","JDLAN")
assert length(shift_start)==5 if !mi(shift_start)

* Check start times consistent with shifts
tab shift_start if leadin_slot==1
tab shift_start if leadin_slot==2 // SOME 9:00/9:15 THOUGH STARTTIME CONSISTENT WITH SLOT 2



* Clean quality
replace quality="10" if quality=="10:00"
replace quality="" if quality=="11" // can't be certain whether this should be 1 or 10
destring quality, replace
assert inrange(quality,0,10) | mi(quality)
la var quality "What was the quality of this participant's production? (0 to 10)"

* Check only one observation per worker ID per day
duplicates tag wave leadin_day_no id, g(dup)
drop if wave==4 & leadin_day_no==1 & id=="EGTMN" & survey_id==62 // drop the duplicate with missing quality variable
drop if wave==8 & leadin_day_no==1 & id=="WHMNJ" & survey_id==149 // Souvik and Dip both entered, keep Dip's
drop if wave==10 & leadin_day_no==1 & id=="JUVQS" & small_bags==22 // Dip entered this ID twice, keep the earliest entry
drop if wave==10 & leadin_day_no==3 & id=="XQPIQ" & survey_id==205 // Souvik and Dip both entered, keep Dip's
drop dup
isid wave leadin_day_no id



* Fix some ID typos before merging (Ids were manually entered by factory managers when collecting production data and made mistakes)
replace id = "KKAGT" if id == "KKAQT" 
replace id = "QHURE" if id == "QHVRE" 
replace id = "DEHIB" if id == "DEHIV"
replace id = "OFTTT" if id=="OATTT" & wave==8 & leadin_slot==1
replace id = "SLDVV" if id=="SLDVB" & wave==8 & leadin_slot==1
replace id = "CCPNP" if id=="CCPMP" & wave==7 & leadin_slot==1
replace id = "BJHGQ" if id=="EJHGQ" & wave==7 & leadin_slot==1
replace id = "KUTTT" if id=="KQTTT" & wave==7 & leadin_slot==1
replace id = "ORGAQ" if id=="ORJAQ" & wave==7 & leadin_slot==1
replace id = "HDEPG" if id=="SDEPG" & wave==7 & leadin_slot==1
replace id = "TXHUL" if id=="TXHUA" & wave==7 & leadin_slot==1
replace id = "TYCTS" if id=="TYSTS" & wave==1 & leadin_slot==1
replace id = "GXVSL" if id=="GXVCL" & wave==1 & leadin_slot==2
replace id = "DQQVD" if id=="DQQED" & wave==1 & leadin_slot==2
replace id = "PBRUS" if id=="PBRVS" & wave==1 & leadin_slot==2
replace id = "AIFWM" if id=="AFIWM" & wave==1 & leadin_slot==2
replace id = "GNPQH" if id=="GNPGH" & wave==1 & leadin_slot==2
replace id = "GIHDE" if id=="GHIDE" & wave==2 & leadin_slot==1
replace id = "PPJOG" if id=="PPJOJ" & wave==2 & leadin_slot==1
replace id = "THIUX" if id=="THIUH" & wave==2 & leadin_slot==1
replace id = "JEDGG" if id=="GEDGG" & wave==2 & leadin_slot==1
replace id = "GVBOP" if id=="BVBOP" & wave==3 & leadin_slot==1
replace id = "XYOKJ" if id=="HYOKJ" & wave==3 & leadin_slot==1
replace id = "MUCON" if id=="NUCON" & wave==3 & leadin_slot==1
replace id = "GIHII" if id=="SIHII" & wave==3 & leadin_slot==1
replace id = "VOEKA" if id=="VOERA" & wave==3 & leadin_slot==1
replace id = "XYOKJ" if id=="SKYKJ" & wave==3 & leadin_slot==1
replace id = "AVNKH" if id=="ABNKH" & wave==3 & leadin_slot==2
replace id = "AGUSI" if id=="AGUST" & wave==3 & leadin_slot==2
replace id = "XKBDR" if id=="XBKDR" & wave==3 & leadin_slot==2
replace id = "BVWJM" if id=="BVWJN" & wave==3 & leadin_slot==2
replace id = "RTGMI" if id=="RTJMI" & wave==3 & leadin_slot==2
replace id = "YHIUV" if id=="YHIQV" & wave==3 & leadin_slot==2
replace id = "ZNPGT" if id=="ZNEGT" & wave==4 & leadin_slot==1
replace id = "FZSNS" if id=="FZSMS" & wave==4 & leadin_slot==1
replace id = "NDSVW" if id=="NDSVM" & wave==4 & leadin_slot==1
replace id = "VFHRX" if id=="EFHRX" & wave==4 & leadin_slot==1
replace id = "QHURE" if id=="UHVRE" & wave==4 & leadin_slot==1
replace id = "OQMQB" if id=="OQMQV" & wave==4 & leadin_slot==2
replace id = "EUAPI" if id=="TUAIP" & wave==4 & leadin_slot==2
replace id = "VDTUB" if id=="VDTWB" & wave==4 & leadin_slot==2
replace id = "YQNRV" if id=="YQNRB" & wave==4 & leadin_slot==2
replace id = "NTEUF" if id=="NTUEF" & wave==4 & leadin_slot==2
replace id = "JSCWO" if id=="JSCWR" & wave==4 & leadin_slot==2
replace id = "LVVZR" if id=="LBBZR" & wave==4 & leadin_slot==2
replace id = "QQUCO" if id=="QQUCA" & wave==4 & leadin_slot==2
replace id = "WCVMZ" if id=="WCVNZ" & wave==4 & leadin_slot==2
replace id = "EKOFZ" if id=="EXYFZ" & wave==5 & leadin_slot==1
replace id = "YNFHB" if id=="UNFHB" & wave==5 & leadin_slot==1
replace id = "WZSIC" if id=="YZSIC" & wave==5 & leadin_slot==1
replace id = "JELOB" if id=="ZELOB" & wave==5 & leadin_slot==1
replace id = "NYJTC" if id=="NYZTC" & wave==5 & leadin_slot==1
replace id = "UCANE" if id=="UCALE" & wave==5 & leadin_slot==1
replace id = "YNFHB" if id=="UNFHB" & wave==5 & leadin_slot==1
replace id = "AXXTI" if id=="AXXPI" & wave==5 & leadin_slot==1
replace id = "EKOFZ" if id=="EKYFZ" & wave==5 & leadin_slot==1
replace id = "FQTNU" if id=="FQTUN" & wave==5 & leadin_slot==1
replace id = "HBEXN" if id=="HBEXM" & wave==5 & leadin_slot==1
replace id = "XOCWR" if id=="XOCVR" & wave==5 & leadin_slot==1
replace id = "XRBZZ" if id=="XRBJJ" & wave==5 & leadin_slot==1
replace id = "JHDLH" if id=="JHBLH" & wave==5 & leadin_slot==2
replace id = "ZYTVY" if id=="XYTVY" & wave==5 & leadin_slot==2
replace id = "YQGZG" if id=="ZQGZG" & wave==5 & leadin_slot==2
replace id = "JWWNK" if id=="ZWWNK" & wave==5 & leadin_slot==2
replace id = "EHUOM" if id=="EHUON" & wave==5 & leadin_slot==2
replace id = "XJCTF" if id=="XJCTA" & wave==5 & leadin_slot==2
replace id = "YQGZG" if id=="ZQGZG" & wave==5 & leadin_slot==2
replace id = "EHUOM" if id=="EHEOM" & wave==5 & leadin_slot==2
replace id = "MLOCN" if id=="MLOCA" & wave==5 & leadin_slot==2
replace id = "YQGZG" if id=="XQGZG" & wave==5 & leadin_slot==2
replace id = "DXFKH" if id=="DKFKH" & wave==6 & leadin_slot==1
replace id = "VJRFK" if id=="VGRFK" & wave==6 & leadin_slot==1
replace id = "PDVFN" if id=="PDBFM" & wave==6 & leadin_slot==1
replace id = "QOIMT" if id=="QOLMT" & wave==6 & leadin_slot==1
replace id = "IVAJL" if id=="IVJAL" & wave==6 & leadin_slot==1
replace id = "DMCPJ" if id=="DMCPG" & wave==6 & leadin_slot==2
replace id = "INTGN" if id=="INTGA" & wave==6 & leadin_slot==2
replace id = "SFBFI" if id=="SFBFF" & wave==6 & leadin_slot==2
replace id = "YBJVK" if id=="YPJVK" & wave==6 & leadin_slot==2
replace id = "DGRJF" if id=="DJRJF" & wave==6 & leadin_slot==2
replace id = "OZSJE" if id=="OZSGE" & wave==6 & leadin_slot==2
replace id = "OZSJE" if id=="OJSJE" & wave==6 & leadin_slot==2
replace id = "SYMVN" if id=="SYMVL" & wave==6 & leadin_slot==2
replace id = "PNWAC" if id=="PNWSE" & wave==7 & leadin_slot==2
replace id = "QFRBM" if id=="UFRBM" & wave==7 & leadin_slot==2
replace id = "YBXFG" if id=="YBXFS" & wave==7 & leadin_slot==2
replace id = "ZLVCU" if id=="ZLVCE" & wave==7 & leadin_slot==2
replace id = "DLJEE" if id=="DLGEE" & wave==7 & leadin_slot==2
replace id = "LVTVW" if id=="LVTBW" & wave==7 & leadin_slot==2
replace id = "NDNCI" if id=="MDNCR" & wave==7 & leadin_slot==2
replace id = "YBXFG" if id=="YBXFS" & wave==7 & leadin_slot==2
replace id = "GBYRQ" if id=="JBYRQ" & wave==7 & leadin_slot==2
replace id = "YBXFG" if id=="YBXFS" & wave==7 & leadin_slot==2
replace id = "VJHOG" if id=="VGHOG" & wave==7 & leadin_slot==2
replace id = "PGKUF" if id=="EGKUF" & wave==8 & leadin_slot==2
replace id = "VTGYB" if id=="VDGYB" & wave==8 & leadin_slot==2
replace id = "WLZJM" if id=="WLZJN" & wave==8 & leadin_slot==2
replace id = "AINXW" if id=="ANIXW" & wave==8 & leadin_slot==2
replace id = "IKYIH" if id=="LKYIH" & wave==9 & leadin_slot==1
replace id = "TKLSV" if id=="DKLSV" & wave==9 & leadin_slot==1
replace id = "OMPNK" if id=="OMPNX" & wave==9 & leadin_slot==1
replace id = "TWQYO" if id=="TWQUO" & wave==9 & leadin_slot==1
replace id = "CBXQE" if id=="CGXQE" & wave==9 & leadin_slot==2
replace id = "OQKPJ" if id=="OQKBJ" & wave==9 & leadin_slot==2
replace id = "WKCFP" if id=="WKCFB" & wave==9 & leadin_slot==2
replace id = "LXCDJ" if id=="LXCPJ" & wave==10 & leadin_slot==1
replace id = "XJIUR" if id=="XZIUR" & wave==10 & leadin_slot==1
replace id = "XZLTA" if id=="ZXLTA" & wave==10 & leadin_slot==1
replace id = "DAXKM" if id=="BAXKM" & wave==10 & leadin_slot==1
replace id = "XJIUR" if id=="XZIUR" & wave==10 & leadin_slot==1
replace id = "VGLZR" if id=="BGLZR" & wave==10 & leadin_slot==1
replace id = "PYGQW" if id=="EYGQW" & wave==10 & leadin_slot==1
replace id = "VQXKL" if id=="BQXKL" & wave==10 & leadin_slot==2
replace id = "WARKU" if id=="WAKRU" & wave==10 & leadin_slot==2
replace id = "XQPIQ" if id=="XQPLQ" & wave==10 & leadin_slot==2
replace id = "GMMZR" if id=="CNNZI" & wave==1 & leadin_slot==1 // less obvious, but likely correct
replace leadin_slot=2 if id=="DAPDH" & leadin_slot==1
replace id = "XDRET" if id=="QEIRF" & wave==2 & leadin_slot==1 
replace id = "DHZVL" if id=="SZQDN" & wave==3 & leadin_slot==2 

* Extra duplicates to drop
drop if wave==4 & leadin_day==2 & leadin_slot==2 & survey_id==77 & id=="VDTUB"
drop if id=="JKIJH" & leadin_day_no==3

tempfile produc_ind
save `produc_ind'


* Now load slots assignments
use $Data/Original/slot_schedules.dta, clear

merge 1:1 wave leadin_slot id leadin_day_no using `produc_ind', assert(1 3)

** Couple of duplicates
drop if id=="CIWJZ" & wave==3 // keep 1st instance because we have productivity data for this one
drop if id=="NPKNA" & wave==4 // drop 2nd instance
drop if id=="TVMOW" & wave==5 // drop 2nd instance

la var id "Worker ID"
la var wave "Wave number (1 to 10)"
la var leadin_slot "Which daily shift (slot) is this? (1 or 2)"
la var leadin_day_no "Lead-in day number (1 to 3)"

g has_leadin_produc_data = _merge==3
la var has_leadin_produc_data "=1 if we have lead-in produc data for this day for this worker"
drop _merge

isid id leadin_day_no

replace id = "KIINC" if id == "MOHST" 
replace id = "RPUGI" if id == "RPUEG" 

* Now merge with religion indicator
merge m:1 id using $Data/Original/individualprod_ids_religion, keep(1 3) nogen
keep if full_participant == 1

* Save Data
save $Data/Final/individual_produc_clean.dta, replace
